## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
## 
##     recode
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

Question 1 - Linear regression with dummy variable [5 marks]

Consider the dataset “weatherhistory.csv”.

  1. Describe dependent variable pressure and independent variable temperature;
  2. Explore the relationship between pressure and temperature;
  3. Build a linear model considering temperature and pressure;
  4. Identify a dummy variable and build extended model considering dummy variable;
  5. Report your findings;

Answer

a) Describe dependent variable pressure and independent variable temperature.

Load the dataset

weatherHistory <- read.csv("weatherHistory.csv")
str(weatherHistory)
## 'data.frame':    96452 obs. of  13 variables:
##  $ X                  : int  2 3 4 5 6 7 8 9 10 11 ...
##  $ time               : chr  "2006-04-01 01:00:00.000 +0200" "2006-04-01 02:00:00.000 +0200" "2006-04-01 03:00:00.000 +0200" "2006-04-01 04:00:00.000 +0200" ...
##  $ summary            : chr  "Partly Cloudy" "Mostly Cloudy" "Partly Cloudy" "Mostly Cloudy" ...
##  $ precipType         : chr  "rain" "rain" "rain" "rain" ...
##  $ temperature        : num  9.36 9.38 8.29 8.76 9.22 ...
##  $ apparentTemperature: num  7.23 9.38 5.94 6.98 7.11 ...
##  $ humidity           : num  0.86 0.89 0.83 0.83 0.85 0.95 0.89 0.82 0.72 0.67 ...
##  $ windSpeed          : num  14.26 3.93 14.1 11.04 13.96 ...
##  $ windBearing        : int  259 204 269 259 258 259 260 259 279 290 ...
##  $ visibility         : num  15.8 15 15.8 15.8 15 ...
##  $ loudCover          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ pressure           : num  1016 1016 1016 1017 1017 ...
##  $ dailySummary       : chr  "Partly cloudy throughout the day." "Partly cloudy throughout the day." "Partly cloudy throughout the day." "Partly cloudy throughout the day." ...
head(weatherHistory)
##   X                          time       summary precipType temperature
## 1 2 2006-04-01 01:00:00.000 +0200 Partly Cloudy       rain    9.355556
## 2 3 2006-04-01 02:00:00.000 +0200 Mostly Cloudy       rain    9.377778
## 3 4 2006-04-01 03:00:00.000 +0200 Partly Cloudy       rain    8.288889
## 4 5 2006-04-01 04:00:00.000 +0200 Mostly Cloudy       rain    8.755556
## 5 6 2006-04-01 05:00:00.000 +0200 Partly Cloudy       rain    9.222222
## 6 7 2006-04-01 06:00:00.000 +0200 Partly Cloudy       rain    7.733333
##   apparentTemperature humidity windSpeed windBearing visibility loudCover
## 1            7.227778     0.86   14.2646         259    15.8263         0
## 2            9.377778     0.89    3.9284         204    14.9569         0
## 3            5.944444     0.83   14.1036         269    15.8263         0
## 4            6.977778     0.83   11.0446         259    15.8263         0
## 5            7.111111     0.85   13.9587         258    14.9569         0
## 6            5.522222     0.95   12.3648         259     9.9820         0
##   pressure                      dailySummary
## 1  1015.63 Partly cloudy throughout the day.
## 2  1015.94 Partly cloudy throughout the day.
## 3  1016.41 Partly cloudy throughout the day.
## 4  1016.51 Partly cloudy throughout the day.
## 5  1016.66 Partly cloudy throughout the day.
## 6  1016.72 Partly cloudy throughout the day.
colnames(weatherHistory)
##  [1] "X"                   "time"                "summary"            
##  [4] "precipType"          "temperature"         "apparentTemperature"
##  [7] "humidity"            "windSpeed"           "windBearing"        
## [10] "visibility"          "loudCover"           "pressure"           
## [13] "dailySummary"
ggplot(weatherHistory, aes(y = temperature)) +
  geom_boxplot(fill = "lightblue", colour = "darkblue") +
  labs(title = "Boxplot of Temperature", y = "Temperature") +
  theme_minimal()

ggplot(weatherHistory, aes(y = pressure)) +
  geom_boxplot(fill = "lightblue", colour = "darkblue") +
  labs(title = "Boxplot of Temperature", y = "Temperature") +
  theme_minimal()

ggplot(weatherHistory, aes(x = temperature)) +
  geom_histogram(bins = 60, fill = "skyblue", color = "black") + # You can adjust the number of bins
  labs(title = "Histogram of Temperature", x = "Temperature (°C)", y = "Frequency") +
  theme_minimal()

ggplot(weatherHistory, aes(x = pressure)) +
  geom_histogram(bins = 200, fill = "lightgreen", color = "black") + # Adjust the number of bins as needed
  labs(title = "Histogram of Pressure", x = "Pressure (hPa)", y = "Frequency") +
  theme_minimal()

psych::describe(weatherHistory[, c("temperature", "pressure")])
##             vars     n    mean     sd  median trimmed   mad    min     max
## temperature    1 96452   11.93   9.55   12.00   11.79 10.40 -21.82   39.91
## pressure       2 96452 1003.24 116.97 1016.45 1016.54  6.81   0.00 1046.38
##               range  skew kurtosis   se
## temperature   61.73  0.09    -0.57 0.03
## pressure    1046.38 -8.42    69.26 0.38

Temperature has the mean 11.93 and the a standard deviation of 9.55. With this standard deviation showing significant fluctuations in the temperature. With the kurtosis of -0.57 indicates a distribution flatter than a normal distribution. We also have 0.03 of standard error of the mean that the measure is precise. We can conclude that temperature is a symmetric and a flatter variable.

Pressure data in the other hand shows a skew data with a lot of zero values. Hit has a kurtosis of 69.26 indicate thick tails and a skew of -8.42 indicating the data is concentrated in the right side of the mean.

Testing the temperature for normality

ggplot(weatherHistory, aes(sample=temperature)) + stat_qq()

ad.test(weatherHistory$temperature)
## 
##  Anderson-Darling normality test
## 
## data:  weatherHistory$temperature
## A = 202.38, p-value < 2.2e-16
ad.test(weatherHistory$pressure)
## 
##  Anderson-Darling normality test
## 
## data:  weatherHistory$pressure
## A = 31503, p-value < 2.2e-16

We can also see that neither temperature or pressure follows a normal distribution.

b) Explore the relationship between pressure and temperature.

To explore the relationship between pressure and temperature.

ggplot(weatherHistory, aes(x = temperature, y = pressure)) +
  geom_point(alpha = 0.5) +  # using some transparency for points
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) +  
  labs(title = "Temperature vs. Pressure",
       x = "Temperature",
       y = "Pressure") +
  theme_minimal()

It seems that the pressure outliers are affecting the relationship between temperature and pressure. Let’s remove them e see the correlation.

weatherHistoryWithPressure <- weatherHistory %>%  
    filter(pressure != 0) %>%  
    filter(!is.na(temperature) & !is.na(pressure))


ggplot(weatherHistoryWithPressure, aes(x = temperature, y = pressure)) +
  geom_point(alpha = 0.5) +  # using some transparency for points
  geom_smooth(method = "lm", se = FALSE, color = "blue", formula = y ~ x) + 
  labs(title = "Temperature vs. Pressure",
       x = "Temperature",
       y = "Pressure") +
  theme_minimal()

Let’s also do a Pearson’s correlation between the variables.

cor.test(weatherHistoryWithPressure$temperature, weatherHistoryWithPressure$pressure)
## 
##  Pearson's product-moment correlation
## 
## data:  weatherHistoryWithPressure$temperature and weatherHistoryWithPressure$pressure
## t = -100.75, df = 95162, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3161859 -0.3047036
## sample estimates:
##        cor 
## -0.3104561

The scatterplot suggest that we have a non-linear relationship between temperature and pressure. So doing a linear regression model will not capture or explain the relationship between them.

The correlation coefficient from the Pearson’s test indicates a negative relationship of 0.31. So it means when the temperature increases the pressure decreases.

The p-value is also too low (2.2e-16) suggesting the relationship is statistical significant.

c) Build a linear model considering temperature and pressure.

First let’s create the model.

lmTemperaturePressure <-  lm(pressure ~ temperature, 
                             data = weatherHistoryWithPressure)
anova(lmTemperaturePressure)
## Analysis of Variance Table
## 
## Response: pressure
##                Df  Sum Sq Mean Sq F value    Pr(>F)    
## temperature     1  554943  554943   10150 < 2.2e-16 ***
## Residuals   95162 5202744      55                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(lmTemperaturePressure)
## 
## Call:
## lm(formula = pressure ~ temperature, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -44.223  -4.050   0.450   4.501  25.527 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.020e+03  3.840e-02 26557.4   <2e-16 ***
## temperature -2.530e-01  2.511e-03  -100.7   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.394 on 95162 degrees of freedom
## Multiple R-squared:  0.09638,    Adjusted R-squared:  0.09637 
## F-statistic: 1.015e+04 on 1 and 95162 DF,  p-value: < 2.2e-16

Standartize co-efficients to get the result expressed in standart deviations.

lm.beta(lmTemperaturePressure)  
## 
## Call:
## lm(formula = pressure ~ temperature, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept) temperature 
##          NA  -0.3104561
suppressWarnings(stargazer(lmTemperaturePressure, type="text"))
## 
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                               pressure           
## -------------------------------------------------
## temperature                   -0.253***          
##                                (0.003)           
##                                                  
## Constant                    1,019.837***         
##                                (0.038)           
##                                                  
## -------------------------------------------------
## Observations                   95,164            
## R2                              0.096            
## Adjusted R2                     0.096            
## Residual Std. Error      7.394 (df = 95162)      
## F Statistic         10,150.310*** (df = 1; 95162)
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01

So we got the following results:

  • For every 1 degree increase in temperature the pressure decreases 0.253 units.
  • The p value is 0.01 who give us 99% confidence level. Also our F statistic is 10,150.310. Suggesting that the temperature is a good predictor of pressure.
  • R squared is 0.096 meaning that 9.6% of the pressure variance is explained by the temperature.
plot(residuals(lmTemperaturePressure), main = "Residuals of the Model",
     xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

plot(lmTemperaturePressure)

Looking at the residuals we can see that there is a funneling effect showing a spread out as the fitted values increase suggesting a non-constant variance. We get a bigger residuals for lower or higher pressures.

d) Identify a dummy variable and build extended model considering dummy variable.

Let’s use the categorical variable precipType as a dummy variable for our model

weatherHistoryDummy <- weatherHistory
weatherHistoryDummy$humidityScale <- cut(weatherHistory$windSpeed, 
                                     breaks=c(0, 0.25, 0.6, 1), 
                                     labels=c("dry", "comfortable", "humid"))
levels(weatherHistoryDummy$humidityScale)
## [1] "dry"         "comfortable" "humid"
table(weatherHistoryDummy$humidityScale)
## 
##         dry comfortable       humid 
##         293         453         268

So we have we possible values: dry, rain and snow.

Mapping them in our dataset.

weatherHistoryDummy$dry <- ifelse(weatherHistoryDummy$humidityScale == "dry", 1, 0)
weatherHistoryDummy$comfortable <- ifelse(weatherHistoryDummy$humidityScale == "comfortable", 1, 0)
weatherHistoryDummy$humid <- ifelse(weatherHistoryDummy$humidityScale == "humid", 1, 0)

table(weatherHistoryDummy[, c("humidityScale")])
## 
##         dry comfortable       humid 
##         293         453         268
table(weatherHistoryDummy[, c("dry")])
## 
##   0   1 
## 721 293
table(weatherHistoryDummy[, c("comfortable")])
## 
##   0   1 
## 561 453
table(weatherHistoryDummy[, c("humid")])
## 
##   0   1 
## 746 268

Lets consider the humid dummy variable and how it influenciates the pressure.

lmTempPresWithDummy <- lm(pressure ~ temperature + humid, data = weatherHistoryDummy)

e) Report your findings.

summary(lmTempPresWithDummy)
## 
## Call:
## lm(formula = pressure ~ temperature + humid, data = weatherHistoryDummy)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1015.12     3.20     8.10    14.09    33.55 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1010.01243    4.91890 205.333   <2e-16 ***
## temperature   -0.08503    0.30486  -0.279    0.780    
## humid          6.05245    6.84406   0.884    0.377    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 96.1 on 1011 degrees of freedom
##   (95438 observations deleted due to missingness)
## Multiple R-squared:  0.0008502,  Adjusted R-squared:  -0.001126 
## F-statistic: 0.4302 on 2 and 1011 DF,  p-value: 0.6505
lm.beta(lmTempPresWithDummy)  
## 
## Call:
## lm(formula = pressure ~ temperature + humid, data = weatherHistoryDummy)
## 
## Standardized Coefficients::
##  (Intercept)  temperature        humid 
##           NA -0.008768578  0.027800804
suppressWarnings(stargazer(lmTempPresWithDummy, type="text"))
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                              pressure          
## -----------------------------------------------
## temperature                   -0.085           
##                               (0.305)          
##                                                
## humid                          6.052           
##                               (6.844)          
##                                                
## Constant                   1,010.012***        
##                               (4.919)          
##                                                
## -----------------------------------------------
## Observations                   1,014           
## R2                             0.001           
## Adjusted R2                   -0.001           
## Residual Std. Error     96.102 (df = 1011)     
## F Statistic            0.430 (df = 2; 1011)    
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01

If we consider the humidity as dummy variable we get the model equaltion: \[ pressure = 1010.012 - 0.085 \times temperature + 6.052 \times humid \]

plot(lmTempPresWithDummy)

Conclusions:

  • The pressure is 1010.012 when the temperature and humidity is 0;
  • The pressure drops 0.085 per degree increase in temperature;
  • When the weather is humid the pressure increases 6.052;
  • R-squared is less than 0.001 indicating that the model explain less than 1% of the variance to the pressure;
  • A negative adjusted R-squared suggests that the model fits the data worse than just the mean of the dependent variable.
  • F-statistic is 0.4302 with a p-value of 0.6505, indicating that the model is not statistically significant.

Question 2 - Multiple Linear Regression [6 marks]

Consider the dataset “weatherhistory.csv”.

  1. Explore the relationship between pressure and windspeed;
  2. Build a linear model considering (windspeed, humidity, temperature) and pressure;
  3. Assess how model meets key assumptions of linear regression;
  4. Investigate a differential effect by adding dummy variable;
  5. Investigate interaction between effect for windspeed and dummy variable;
  6. Report your findings.

Answer

a) Explore the relationship between pressure and windspeed

First we create a scatterplot to visualize the correlation between pressure and windspeed.

ggplot(weatherHistory, aes(x = windSpeed, y = pressure)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue", formula="y ~ x") +
  labs(title = "Pressure vs Wind Speed",
       x = "Wind Speed",
       y = "Pressure") +
  theme_minimal()

We have a lot of points with pressure equals 0. So let’s remove the outliers.

ggplot(weatherHistoryWithPressure, aes(x = windSpeed, y = pressure)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", color = "blue", formula="y ~ x") +
  labs(title = "Pressure vs Wind Speed",
       x = "Wind Speed",
       y = "Pressure") +
  theme_minimal()

Let’s also do a Pearson correaltion test

cor.test(weatherHistoryWithPressure$pressure, weatherHistoryWithPressure$windSpeed)
## 
##  Pearson's product-moment correlation
## 
## data:  weatherHistoryWithPressure$pressure and weatherHistoryWithPressure$windSpeed
## t = -80.909, df = 95162, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2596340 -0.2477448
## sample estimates:
##       cor 
## -0.253699

Considerations;

  • The regression line, shown in blue, indicates a negative linear relationship between wind speed and pressure. As wind speed increases, pressure tends to decrease;
  • The data points are concentrated for lower wind speeds suggesting higher variance for higher wind speeds. Also its uncommon wind speeds higher than 40;
  • The negative correlation of 0.25 suggest indicates a week relationship between both variables;
  • The p-value of 2.2e-16 for a confidence level of 95% suggests that the correlation is statistical significant.

b) Build a linear model considering (windspeed, humidity, temperature) and pressure

modelPressure <- lm(pressure ~ windSpeed + humidity +  temperature, data = weatherHistoryWithPressure)
summary(modelPressure)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -45.153  -3.811   0.382   4.172  24.327 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.037e+03  1.526e-01  6796.3   <2e-16 ***
## windSpeed   -3.771e-01  3.323e-03  -113.5   <2e-16 ***
## humidity    -1.525e+01  1.512e-01  -100.9   <2e-16 ***
## temperature -4.479e-01  3.019e-03  -148.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.779 on 95160 degrees of freedom
## Multiple R-squared:  0.2404, Adjusted R-squared:  0.2404 
## F-statistic: 1.004e+04 on 3 and 95160 DF,  p-value: < 2.2e-16

Standartize co-efficients to get the result expressed in standart deviations.

lm.beta(modelPressure)
## 
## Call:
## lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)
## 
## Standardized Coefficients::
## (Intercept)   windSpeed    humidity temperature 
##          NA  -0.3341233  -0.3835520  -0.5497544
suppressWarnings(stargazer(modelPressure, type="text"))
## 
## =================================================
##                          Dependent variable:     
##                     -----------------------------
##                               pressure           
## -------------------------------------------------
## windSpeed                     -0.377***          
##                                (0.003)           
##                                                  
## humidity                     -15.253***          
##                                (0.151)           
##                                                  
## temperature                   -0.448***          
##                                (0.003)           
##                                                  
## Constant                    1,037.444***         
##                                (0.153)           
##                                                  
## -------------------------------------------------
## Observations                   95,164            
## R2                              0.240            
## Adjusted R2                     0.240            
## Residual Std. Error      6.779 (df = 95160)      
## F Statistic         10,037.910*** (df = 3; 95160)
## =================================================
## Note:                 *p<0.1; **p<0.05; ***p<0.01

We got the model equation: \[ pressure = 1037.444 - 0.377 \times windSpeed - 15.253 \times humidity - 0.448 \times temperature \] We can conclude the following:

  • Intercept 1037.444. Value expected when all predictors are zt zero.
  • All predictors are negative indicating then when they increase the pressure will drop for every unit each (0.377 windSpeed, 15.256 humidity and 0.448 temperature);
  • The humidity is the less influential in the pressure because it ranges between 0 and 1;
  • All the variables have a p-value lower tha 0.01 meaning that we have a confidence level bigger than 99% that they’re statistical significant;
  • The \(R^2\) and \(Ajusted\ R^2\) are 0.24 meaning that this model explain 24% of the pressure variance;
  • We have a residual standart error of approximately 6.779. Considering that the pressure ranges are bigger than 1000 is a small error;
  • The F Statistic 10,037.910 is a high and significant (p < 0.01) value confirm that is a statistic model.

c) Assess how model meets key assumptions of linear regression.

Linear relationship between the response variable and the predictors

linearity <- weatherHistoryWithPressure |> mutate(yhat = fitted(modelPressure), res = residuals(modelPressure))

ggplot(linearity, aes(x = yhat, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "Fitted vs Residuals",x = "Fitted",y = "Residuals")

ggplot(linearity, aes(x = windSpeed, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "WindSpeed vs Residuals",x = "WindSpeed",y = "Residuals")

ggplot(linearity, aes(x = humidity, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "humidity vs Residuals",x = "humidity",y = "Residuals")

ggplot(linearity, aes(x = temperature, y = res)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red", linetype = "dashed") +
  geom_smooth(method = "gam", color = "blue", se = FALSE, formula = y ~ s(x, bs = "cs")) +
  labs(title = "temperature vs Residuals",x = "temperature",y = "Residuals")

We got the following results:

  • The first chart shows the Fitted Values vs Residuals. It shows a curve in the tails (lower and higher pressures), not suggesting that the relationship between the predictors and the dependent variable is not linear.

  • The chart for the WindSpeed is fairly linear until we get the 30 Km/h but after that we get a positive curve not suggesting a linear relation for higher windspeeds with the variance increasing.

  • In the other side we have the humidity. The variance decreases with lower humidity values (<0.25) and after is fairly constant suggesting a linear relationship for humidity >= 0.25.

  • The temperature variable agains the residuals show a sightly curve around the 0 line. Show we can say that the temperature is somewhat linear.

The result is that this model is not a linear model. We can improve it removing the outliers or do some kind of transformation.

Independence of errors

plot(resid(modelPressure), type = 'l', main = "Residuals Plot",
     xlab = "Index", ylab = "Residuals")
abline(h = 0, col = "red")

The Residuals Plot isplays the residuals from your linear regression model plotted against their index, which represents the order in which data were collected (you can confirm by the time of the data tha is in order).

As we can see the model does not presents obvious bias estimating the dependent variable.

We can assume that the model is independent of errors.

Homoscedasticity

The variance of the residuals should remain constant across all levels of the independent variables.

By the previous Residuals Plots we see that they present a funneling shape for all variables suggesting that the model exhibits heteroscedasticity. To confirm we can do Breusch-Pagan test.

bptest(modelPressure)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelPressure
## BP = 6388.8, df = 3, p-value < 2.2e-16

Given the result of the test we can conclude that with a small p-value (2.2e-16) we have strong evidence that the variance of the residuals is not constant.

Normality of Errors

Residuals should be approximately normally distributed. To verify we will do the histogram for the residuals

ggplot(linearity, aes(x = res)) +
  geom_histogram(aes(y = after_stat(density)), bins = 50) +
  ggtitle("Histogram of Residuals") +
  xlab("Residuals") +
  ylab("Density") +
  stat_function(fun = dnorm, args = list(mean = mean(linearity$res), sd = sd(linearity$res)), color = "blue", linewidth = 1) +
  theme_minimal()

qqnorm(linearity$res, main = "Q-Q Plot of Residuals")
qqline(linearity$res, col = "red")

ad.test(linearity$res)
## 
##  Anderson-Darling normality test
## 
## data:  linearity$res
## A = 331.88, p-value < 2.2e-16

By the qqplot and the result of Anderson-Darling normality test we can conclude that the residuals don’t not follow a normal distribution.

ad.test(linearity$res)
## 
##  Anderson-Darling normality test
## 
## data:  linearity$res
## A = 331.88, p-value < 2.2e-16

No Multicollinearity among Predictors

To measure how much the variance of an estimated regression coefficient increases if the predictors are correlated.

So if the Vif results can be interpreted based on the results per variable. If VIF equals:

  • 1: that there is no correlation between a given predictor with others;
  • = 1 && <= 5: Moderate correlation but no severe;

  • 5: Higly correlated.

vif(modelPressure)
##   windSpeed    humidity temperature 
##    1.086068    1.811151    1.720221
vif(modelPressure)
##   windSpeed    humidity temperature 
##    1.086068    1.811151    1.720221

Based on our results or predictors are Lightly correlate between them not affecting our predictive model. In other words: each coefficient produced by the regression model is likely reliable.

No Perfect Multicollinearity

cor(weatherHistoryWithPressure[c("windSpeed", "humidity", "temperature")])
##               windSpeed   humidity temperature
## windSpeed    1.00000000 -0.2242867  0.01018877
## humidity    -0.22428667  1.0000000 -0.63277644
## temperature  0.01018877 -0.6327764  1.00000000

Looking at the correlation matrix we only have one moderate negative correlation, that is humidity with temperature (-0.633). This value is significant but not indicative of high multicollinearity.

So we can conclude that there is no Perfect Multicollinearity in our model.

d) Investigate a differential effect by adding dummy variable

First lets create a dummy variable. We consider to create a variable to verify the day is cloudy or not.

weatherHistoryWithPressure$cloudy <- ifelse(grepl("cloudy", tolower(weatherHistoryWithPressure$summary)), 1, 0)
table(weatherHistoryWithPressure$cloudy)
## 
##     0     1 
## 34449 60715

Comparing the results between the models (Simple and With the Dummy variable).

modelPressureSimple <- lm(pressure ~ windSpeed + humidity +  temperature , data = weatherHistoryWithPressure)
lm.beta(modelPressureSimple)

Call: lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)

Standardized Coefficients:: (Intercept) windSpeed humidity temperature NA -0.3341233 -0.3835520 -0.5497544

modelPressureDummy <- lm(pressure ~ windSpeed + humidity +  temperature , data = weatherHistoryWithPressure)
lm.beta(modelPressureDummy)

Call: lm(formula = pressure ~ windSpeed + humidity + temperature, data = weatherHistoryWithPressure)

Standardized Coefficients:: (Intercept) windSpeed humidity temperature NA -0.3341233 -0.3835520 -0.5497544

suppressWarnings(stargazer(modelPressureSimple, modelPressureDummy, type="html", title= "Regression Results", align = T))
Regression Results
Dependent variable:
pressure
(1) (2)
windSpeed -0.377*** -0.377***
(0.003) (0.003)
humidity -15.253*** -15.253***
(0.151) (0.151)
temperature -0.448*** -0.448***
(0.003) (0.003)
Constant 1,037.444*** 1,037.444***
(0.153) (0.153)
Observations 95,164 95,164
R2 0.240 0.240
Adjusted R2 0.240 0.240
Residual Std. Error (df = 95160) 6.779 6.779
F Statistic (df = 3; 95160) 10,037.910*** 10,037.910***
Note: p<0.1; p<0.05; p<0.01